# Load the raw data file.
data <- read.csv("../data/filledDatabase111119NUMONLY.csv")
# Clean the raw data, then collapse it (both are project helpers).
data <- clean_data(data) %>% collapse_data()
# Keep the compound identifiers and group categories aside from the predictors.
compound <- data$Compound
group_cat <- data$GroupCat
# Drop the identifier columns "Compound" and "X".
data <- select(data, -c("Compound", "X"))
# Build a scaled data frame from the first 13 PCs.
# NOTE(review): data[, -1] presumably drops GroupCat (assumed to be the first
# remaining column) -- confirm against the output of collapse_data().
data_pca <- get_pc_space(data[, -1], k = 13) %>% scale() %>% data.frame()
# Split the row indices into 5 folds for cross validation later; with
# returnTrain = FALSE each list element holds the held-out row indices.
# seq_len() is used instead of 1:nrow() so that an empty data set does not
# silently produce the bogus index vector c(1, 0).
# NOTE(review): the fold assignment is random -- confirm a seed is set
# upstream if reproducible folds are required.
folds <- caret::createFolds(
  seq_len(nrow(data)),
  k = 5,
  list = TRUE,
  returnTrain = FALSE
)
Multinomial Regression
library(glmnet)
# Predictor matrix: every column of `data` except the first.
X <- as.matrix(data[, -1])
# Response: the GroupCat column, converted with as.matrix().
Y <- as.matrix(data$GroupCat)
Coefficients
Ridge
# Ridge penalty (alpha = 0): 5-fold cross-validated multinomial glmnet fit,
# scored by deviance.
ridge_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 0,
  nfolds = 5,
  type.measure = "deviance"
)
# Hand the fit and its lambda.min to the project helper get_coef(), then pass
# the result on to plot_coef().
plot_coef(get_coef(ridge_cv, tuning_parameter = ridge_cv$lambda.min))

LASSO
# LASSO penalty (alpha = 1): 5-fold cross-validated multinomial glmnet fit,
# scored by deviance.
lasso_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 1,
  nfolds = 5,
  type.measure = "deviance"
)
# Hand the fit and its lambda.min to the project helper get_coef(), then pass
# the result on to plot_coef().
plot_coef(get_coef(lasso_cv, tuning_parameter = lasso_cv$lambda.min))

Elastic Net
library(caret)
# Elastic net: let caret tune a glmnet model of GroupCat on all other columns
# using 5-fold cross validation and tuneLength = 10.
elastic_cv <- train(
  GroupCat ~ .,
  data = data,
  method = "glmnet",
  trControl = trainControl(method = "cv", number = 5),
  tuneLength = 10
)
# Pass the final model and the selected lambda to the project helper
# get_coef(), then hand the result to plot_coef().
plot_coef(
  get_coef(
    elastic_cv$finalModel,
    tuning_parameter = elastic_cv$bestTune$lambda
  )
)

Accurate classification rate
Ridge
# Build the ridge prediction tables at the CV-selected lambda
# (prediction_table() is a project helper), then print its `r` component.
tb_ridge <- prediction_table(alpha = 0, lambda = ridge_cv$lambda.min)
print_accurate_tb(tb_ridge$r)
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.6477273 | 0.625 | 0.6741573 | 0.5340909 | 0.6704545 | 0.630286 |
# Render the `t` component of the ridge tables as highlighted counts.
highlight_tb_count(tb_ridge$t)
|  | 16 | 3 | 5 | 6 | Other |
|---|---|---|---|---|---|
| 16 | 68 | 12 | 9 | 2 | 20 |
| 3 | 15 | 111 | 34 | 9 | 6 |
| 5 | 10 | 18 | 66 | 0 | 5 |
| 6 | 8 | 2 | 0 | 28 | 0 |
| Other | 6 | 1 | 6 | 0 | 5 |
| Total | 107 | 144 | 115 | 39 | 36 |
# Render the `t` component of the ridge tables as highlighted percentages.
highlight_tb_percent(tb_ridge$t)
|  | 16 | 3 | 5 | 6 | Other |
|---|---|---|---|---|---|
| 16 | 0.64 | 0.08 | 0.08 | 0.05 | 0.56 |
| 3 | 0.14 | 0.77 | 0.3 | 0.23 | 0.17 |
| 5 | 0.09 | 0.12 | 0.57 | 0 | 0.14 |
| 6 | 0.07 | 0.01 | 0 | 0.72 | 0 |
| Other | 0.06 | 0.01 | 0.05 | 0 | 0.14 |
| Total | 100% | 100% | 100% | 100% | 100% |
LASSO
# Build the LASSO prediction tables at the CV-selected lambda
# (prediction_table() is a project helper), then print its `r` component.
tb_lasso <- prediction_table(alpha = 1, lambda = lasso_cv$lambda.min)
print_accurate_tb(tb_lasso$r)
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.6590909 | 0.6136364 | 0.6741573 | 0.5340909 | 0.6590909 | 0.6280133 |
# Render the `t` component of the LASSO tables as highlighted counts.
highlight_tb_count(tb_lasso$t)
|  | 16 | 3 | 5 | 6 | Other |
|---|---|---|---|---|---|
| 16 | 66 | 9 | 11 | 2 | 18 |
| 3 | 14 | 111 | 33 | 9 | 7 |
| 5 | 10 | 20 | 67 | 0 | 5 |
| 6 | 9 | 2 | 0 | 28 | 1 |
| Other | 8 | 2 | 4 | 0 | 5 |
| Total | 107 | 144 | 115 | 39 | 36 |
# Render the `t` component of the LASSO tables as highlighted percentages.
highlight_tb_percent(tb_lasso$t)
|  | 16 | 3 | 5 | 6 | Other |
|---|---|---|---|---|---|
| 16 | 0.62 | 0.06 | 0.1 | 0.05 | 0.5 |
| 3 | 0.13 | 0.77 | 0.29 | 0.23 | 0.19 |
| 5 | 0.09 | 0.14 | 0.58 | 0 | 0.14 |
| 6 | 0.08 | 0.01 | 0 | 0.72 | 0.03 |
| Other | 0.07 | 0.01 | 0.03 | 0 | 0.14 |
| Total | 100% | 100% | 100% | 100% | 100% |
Elastic Net
# Build the elastic-net prediction tables at the alpha/lambda pair selected by
# caret (prediction_table() is a project helper), then print its `r` component.
# The tuning parameters are read from bestTune by name rather than by column
# position, matching the `elastic_cv$bestTune$lambda` access used when plotting
# the coefficients and staying correct regardless of column order.
tb_elastic <- prediction_table(
  alpha = elastic_cv$bestTune$alpha,
  lambda = elastic_cv$bestTune$lambda
)
print_accurate_tb(tb_elastic$r)
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.6363636 | 0.6363636 | 0.7078652 | 0.5681818 | 0.625 | 0.6347549 |
# Render the `t` component of the elastic-net tables as highlighted counts.
highlight_tb_count(tb_elastic$t)
|  | 16 | 3 | 5 | 6 | Other |
|---|---|---|---|---|---|
| 16 | 66 | 9 | 10 | 2 | 18 |
| 3 | 14 | 109 | 32 | 8 | 6 |
| 5 | 11 | 20 | 68 | 0 | 3 |
| 6 | 7 | 3 | 0 | 29 | 1 |
| Other | 9 | 3 | 5 | 0 | 8 |
| Total | 107 | 144 | 115 | 39 | 36 |
# Render the `t` component of the elastic-net tables as highlighted percentages.
highlight_tb_percent(tb_elastic$t)
|  | 16 | 3 | 5 | 6 | Other |
|---|---|---|---|---|---|
| 16 | 0.62 | 0.06 | 0.09 | 0.05 | 0.5 |
| 3 | 0.13 | 0.76 | 0.28 | 0.21 | 0.17 |
| 5 | 0.1 | 0.14 | 0.59 | 0 | 0.08 |
| 6 | 0.07 | 0.02 | 0 | 0.74 | 0.03 |
| Other | 0.08 | 0.02 | 0.04 | 0 | 0.22 |
| Total | 100% | 100% | 100% | 100% | 100% |